home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Amiga Format CD 46
/
Amiga Format CD46 (1999-10-20)(Future Publishing)(GB)[!][issue 1999-12].iso
/
-serious-
/
comms
/
www
/
urlx
/
urlx.c
< prev
next >
Wrap
C/C++ Source or Header
|
1999-09-06
|
11KB
|
563 lines
/*
* Program to extract URLs from any file.
*/
#include <sys/types.h>
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ubi_BinTree.h"
/*
 * One entry of the URL tree: the tree node header followed directly by the
 * NUL-terminated URL text.  Records are allocated with
 * malloc(sizeof(NameRec) + length + 1), so Name is a C99 flexible array
 * member rather than the non-standard zero-length array.
 */
typedef struct
{
    ubi_trNode Node;    /* tree linkage; must stay the first member because
                           records are cast to and from node pointers */
    char Name[];        /* URL text, stored inline after the node */
} NameRec;
/* Root of the binary tree that holds every URL recorded so far. */
static ubi_trRoot Root;
/* Input stream; main() sets it to stdin when <infile> is "-". */
FILE *f1_p;
/* Output file when [outfile] is given; stays 0 when stdout is used. */
FILE *f2_p;
/* Size in bytes of the scan buffer that main() allocates. */
#define BUFSIZE 65536
/*
 * Program version string.  Callers print PROGVER + 7, which skips the
 * leading NUL byte and the "$VER: " tag (presumably the AmigaOS version-tag
 * convention -- confirm) and leaves only the human-readable part.
 */
static const char PROGVER[] =
"\0$VER: urlx V1.00 ** (c) Aug 99 ** frans";
/*
 * Help text printed by usage().
 * NOTE(review): main() also accepts -w (allow white space in urls), -v/-vv
 * and -q/-qq, none of which are listed here.
 */
static const char USAGE[] =
"usage: urlx [options] <infile> [outfile]\n\n"
"options:\n"
" -h output as html file\n"
" -s sort output\n"
" -u special url sorted output\n"
" -p keep parameters after url's\n"
" -d allow duplicate url's\n"
" -a allow accented characters\n"
" -.<ext> output only matching extension(s)\n"
" -i output only .html etc entries\n"
"\n"
" <infile> can be - to indicate stdin";
/*
 * Print a printf-style message to stderr and terminate the program.
 *
 * msg  format string, followed by the arguments it consumes.
 *
 * Never returns.  The variable arguments are collected with <stdarg.h>;
 * taking the address of the last named parameter and passing it off as a
 * va_list only works on ABIs that pass every argument on the stack.
 * The exit status stays 0, as before, so existing scripts see no change.
 */
void shutdown(char *msg, ...)
{
    va_list args;

    va_start(args, msg);
    vfprintf(stderr, msg, args);
    va_end(args);
    exit(0);
}
/*
 * Print the program banner followed by the help text, then exit through
 * shutdown().
 */
void usage(void)
{
    /* skip the NUL byte and the "$VER: " tag in front of the banner */
    const char *banner = PROGVER + 7;

    shutdown("%s\n\n%s\n\n", banner, USAGE);
}
/*
 * Header placed in front of every block handed out by xmalloc().  The
 * headers form a singly linked list so that xfree(0) can release all
 * outstanding blocks at once.
 *
 * data is a C99 flexible array member (the zero-length array it replaces is
 * a compiler extension) and uses plain unsigned char instead of the BSD
 * u_char typedef.  xfree() relies on data starting exactly sizeof(memChunk)
 * bytes into the allocation.
 */
typedef struct memChunk
{
    struct memChunk *next;  /* next chunk in the list, 0 at the end */
    unsigned char data[];   /* storage returned to the caller */
} memChunk;

/* Head of the list of live xmalloc() chunks; 0 when the list is empty. */
memChunk *xmc = 0;
void *xmalloc(long len)
{
memChunk *mc;
if ((mc = (memChunk *)malloc(sizeof(memChunk) + len)) == 0)
shutdown("out of memory\n");
mc->next = xmc;
xmc = mc;
return ((void *)mc->data);
}
void xfree(void *mem)
{
memChunk *mc, *this, *prev, *next;
if (mem)
mc = (memChunk *)((char *)mem - sizeof(memChunk));
else
mc = 0;
for (this = xmc, prev = 0; this != 0; this = next)
{
next = this->next;
if (mc == 0 || mc == this)
{
if (prev)
prev->next = next;
else
xmc = next;
free(this);
if (mc)
break;
}
else
prev = this;
}
}
/*
 * Exit handler (main() registers it with atexit()): close the input and
 * output files unless they are the standard streams, then release every
 * block still owned by xmalloc().
 */
void cleanup(void)
{
    FILE *in = f1_p;
    FILE *out = f2_p;

    if (in != 0 && in != stdin)
        fclose(in);

    if (out != 0 && out != stdout)
        fclose(out);

    xfree(0);
}
/*---------------------------------------------------------------
* btree stuff
*/
/*
 * Case-insensitive comparison of two counted strings.
 *
 * The common leading part is compared with strnicmp(); when it is equal the
 * shorter string sorts first.
 *
 * Returns the strnicmp() result when the common part differs, otherwise
 * 1 if s1 is longer, -1 if s2 is longer and 0 if both have the same length.
 */
int str_compare(char *s1, int s1len, char *s2, int s2len)
{
    int common = s1len;
    int diff;

    if (s2len < common)
        common = s2len;

    diff = strnicmp(s1, s2, common);
    if (diff != 0)
        return (diff);

    if (s1len == s2len)
        return (0);

    return ((s1len > s2len) ? 1 : -1);
}
/*
 * Tree comparison callback for the plain sort order: compares the key in
 * ItemPtr with the name stored in the node, ignoring case.
 *
 * Returns the result of stricmp(key, node name).
 */
static int CompareFunc(ubi_trItemPtr ItemPtr, ubi_trNodePtr NodePtr)
{
    NameRec *rec = (NameRec *)NodePtr;

    return (stricmp((char *)ItemPtr, rec->Name));
}
/*
 * Split a URL into its leading "scheme://" part and the host part that
 * follows it.
 *
 * p      NUL-terminated URL text.
 * pp     receives a pointer to the first character after the prefix
 *        (written only when a prefix is found).
 * pplen  receives the number of characters from *pp up to, but not
 *        including, the next '/' or the end of the string (written only
 *        when a prefix is found).
 *
 * The prefix is everything up to and including the two characters after the
 * first ':'; those two characters are skipped without being inspected.
 *
 * Returns the length of the prefix (the offset of *pp from p), or 0 when
 * the string has no ':' or ends within the two characters after it.  The
 * latter case used to step over the terminator and read out of bounds.
 */
int get_first_bit(char *p, char **pp, int *pplen)
{
    char *start = p;
    char *host = 0;
    int prefix_len = 0;

    for (; *p != 0; p++)
    {
        if (*p == ':' && host == 0)
        {
            /* never skip past the end of the string */
            if (p[1] == 0 || p[2] == 0)
                return (0);
            p += 2;
            host = p + 1;
            prefix_len = (int)(host - start);
        }
        else if (*p == '/' && host != 0)
        {
            break;
        }
    }
    if (prefix_len)
    {
        *pp = host;
        *pplen = (int)(p - host);
    }
    return (prefix_len);
}
/*
 * Find the part of a counted string that follows its last '.'.
 *
 * p     start of the text (need not be NUL-terminated).
 * plen  number of characters to look at.
 * pp    receives a pointer to the character after the last '.', or p itself
 *       when the text contains no '.'.
 *
 * Returns the number of characters from *pp to the end of the text.
 */
int get_last_bit(char *p, int plen, char **pp)
{
    int tail = 0;   /* offset of the character after the last '.' */
    int i;

    /* walk backwards: the first '.' met this way is the last one */
    for (i = plen; i > 0; i--)
    {
        if (p[i - 1] == '.')
        {
            tail = i;
            break;
        }
    }
    *pp = p + tail;
    return (plen - tail);
}
/*
 * special url order
 *
 * Tree comparison callback for the -u sort order.  When both names start
 * with the same prefix (as found by get_first_bit(), ignoring case) their
 * host parts are compared piece by piece from the right: first the text
 * after the last '.', then the piece before it, and so on.  This groups
 * entries by the rightmost host components.  In every other case, and when
 * the compared pieces are all equal, the two complete names are compared
 * with str_compare().
 *
 * Returns a value less than, equal to or greater than 0.
 */
static int URLCompareFunc(ubi_trItemPtr ItemPtr, ubi_trNodePtr NodePtr)
{
    char *Name = (char *)ItemPtr;
    char *NodeName = ((NameRec *)NodePtr)->Name;
    int result;
    char *p1 = 0, *p2 = 0;
    int p1len = 0, p2len = 0;
    char *s1, *s2;
    int s1len, s2len;

    s1len = get_first_bit(Name, &p1, &p1len);
    s2len = get_first_bit(NodeName, &p2, &p2len);

    /*
     * A return of 0 means no prefix was found and the host pointer and
     * length were not written, so the host loop must not run on them.
     */
    if (   s1len == 0
        || s1len != s2len
        || strnicmp(Name, NodeName, s1len) != 0)
        return str_compare(Name, strlen(Name), NodeName, strlen(NodeName));

    for (;;)
    {
        s1len = get_last_bit(p1, p1len, &s1);
        s2len = get_last_bit(p2, p2len, &s2);
        result = str_compare(s1, s1len, s2, s2len);
        if (result != 0)
            return (result);
        /* stop as soon as either host has no further piece to the left */
        if (s1len >= p1len || s2len >= p2len)
            break;
        /* drop the piece just compared together with the '.' before it */
        p1len -= (s1len + 1);
        p2len -= (s2len + 1);
    }
    return str_compare(Name, strlen(Name), NodeName, strlen(NodeName));
}
/*
 * Node destructor handed to ubi_trKillTree(): releases one tree record.
 * Records are allocated with plain malloc() in main(), so free() is the
 * matching call.
 */
static void KillNode(ubi_trNodePtr NodePtr)
{
    void *record = (void *)NodePtr;

    free(record);
}
int fillbuf(u_char *buf, int buflen, char *data, int datalen)
{
if (datalen)
memcpy(buf, data, datalen);
datalen += fread(buf + datalen, 1, buflen - datalen, f1_p);
return (datalen);
}
/*
 * Return the length of the URL scheme prefix found at p ("http://",
 * "https://" or "ftp://", compared without regard to case), or 0 when none
 * matches.  left is the number of valid bytes at p; a prefix that does not
 * fit inside them is never matched, so nothing beyond the valid data is
 * read.
 */
static int url_scheme_len(unsigned char *p, long left)
{
    if (left <= 0)
        return (0);
    switch (*p)
    {
    case 'h': case 'H':
        if (left >= 7 && strnicmp((char *)p, "http://", 7) == 0)
            return (7);
        if (left >= 8 && strnicmp((char *)p, "https://", 8) == 0)
            return (8);
        break;
    case 'f': case 'F':
        if (left >= 6 && strnicmp((char *)p, "ftp://", 6) == 0)
            return (6);
        break;
    default:
        break;
    }
    return (0);
}

/*
 * Return non-zero when the text after the last '.' of a URL starts with
 * ext (compared without regard to case).  lastdot is 0 when the URL has no
 * '.', end points one past the last character of the URL.  The comparison
 * never reads at or beyond end.
 */
static int url_ext_matches(unsigned char *lastdot, unsigned char *end, char *ext)
{
    size_t extlen;

    if (lastdot == 0 || ext[0] == 0)
        return (0);
    extlen = strlen(ext);
    if ((size_t)(end - lastdot - 1) < extlen)
        return (0);
    return (strnicmp((char *)lastdot + 1, ext, extlen) == 0);
}

/*
 * urlx entry point: parse the options, scan the input for URLs starting
 * with http://, https:// or ftp:// and print them as plain lines or as an
 * HTML list, either in the order found or sorted (-s, -u).
 *
 * Changes relative to the previous version:
 *  - sorted HTML output strips the "http://" prefix from the link text
 *    again (a missing "else" reset the prefix length to 0);
 *  - an extension given with -.<ext> that does not fit the 40 byte buffer
 *    is rejected instead of overflowing it;
 *  - a URL candidate that fills the whole buffer no longer causes an
 *    endless refill loop;
 *  - the input is scanned up to its very last byte, and the end of the
 *    input terminates a URL like a newline does;
 *  - the extension filter no longer reads past the end of the URL.
 */
int main(int argc, char **argv)
{
    int i, quiet_f, verbose_f;
    int param_ok_f, html_f, index_f, space_ok_f, sort_f, dup_ok_f, accent_ok_f;
    int eof_f;
    long datalen;
    FILE *f_out;
    char *f1_name, *f2_name;
    char *arg;
    unsigned char *p;
    char ext1[40], ext2[40];
    unsigned char *buf;
    int valid;

    f1_p = f2_p = 0;
    f1_name = f2_name = 0;
    buf = 0;
    memset(&Root, 0, sizeof(ubi_trRoot));
    quiet_f = verbose_f = 0;
    param_ok_f = html_f = index_f = space_ok_f = sort_f = dup_ok_f = accent_ok_f = 0;
    ext1[0] = ext2[0] = 0;
    atexit(cleanup);

    for (i = 1; i < argc; i++)
    {
        if (verbose_f)
            fprintf(stderr, "%s\n", argv[i]);
        arg = argv[i];
        if (*arg == '-')
        {
            switch (*++arg)
            {
            case 'v':
                verbose_f = 1;
                if (*++arg == 'v')
                    verbose_f = 2;
                break;
            case 'q':
                /* accepted, but the flag is not read anywhere */
                quiet_f = 1;
                if (*++arg == 'q')
                    quiet_f = 2;
                break;
            case 'p':
                param_ok_f = 1;
                break;
            case 'a':
                accent_ok_f = 1;
                break;
            case 'w': /* allow white space in urls */
                space_ok_f = 1;
                break;
            case 'i':
                index_f = 1;
                strcpy(ext1, "htm");
                strcpy(ext2, "shtm");
                break;
            case 'h':
                html_f = 1;
                break;
            case 's':
                sort_f = 1;
                break;
            case 'u':
                sort_f = 2;
                break;
            case 'd':
                dup_ok_f = 1;
                break;
            case '.':
                ++arg;
                /* ext1 and ext2 have the same size; refuse what cannot fit */
                if (strlen(arg) >= sizeof(ext1))
                    usage();
                if (ext1[0] == 0)
                    strcpy(ext1, arg);
                else if (ext2[0] == 0)
                    strcpy(ext2, arg);
                else
                    usage();
                break;
            case 0:
                /* a lone "-" selects stdin, but only once */
                if (!f1_p)
                {
                    f1_p = stdin;
                    break;
                }
                /* fallthrough */
            default:
                usage();
            }
        }
        else if (f1_p == 0 && f1_name == 0)
            f1_name = arg;
        else if (f2_name == 0)
            f2_name = arg;
        else
            usage();
    }
    if (f1_p == 0 && f1_name == 0)
        usage();
    if (verbose_f)
        printf("%s\n\n", PROGVER + 7);
    if (f1_p == 0 && (f1_p = fopen(f1_name, "rb")) == 0)
        shutdown("Couldn't open input file %s\n", f1_name);
    if (f2_name && (f2_p = fopen(f2_name, "wb")) == 0)
        shutdown("Couldn't open output file %s\n", f2_name);
    if (f2_p)
        f_out = f2_p;
    else
        f_out = stdout;

    ubi_trInitTree(&Root,
                   (sort_f == 2)? URLCompareFunc : CompareFunc,
                   0); /* Don't allow overwrites or duplicates */
    buf = xmalloc(BUFSIZE);

    /*
     * read file into buffer and scan
     *
     * valid is the result of the last URL scan: 1 = properly terminated,
     * 0 = rejected, -1 = ran into the end of the buffer before the end of
     * the input.  eof_f is set once a refill returns less than a full
     * buffer, i.e. no more input will follow.
     */
    for (p = buf, datalen = 0, valid = 0, eof_f = 0; ; )
    {
        unsigned char *url, *lastdot;
        int i_f, plen;
        long left = datalen - (p - buf);

        /*
         * Refill when a URL was cut off by the end of the buffer or when
         * fewer bytes remain than the longest prefix needs plus one.
         */
        if (!eof_f && (valid == -1 || left <= 8))
        {
            if (datalen && left > 0)
                datalen = fillbuf(buf, BUFSIZE, (char *)p, (int)left);
            else
                datalen = fillbuf(buf, BUFSIZE, 0, 0);
            if (verbose_f)
                fprintf(stderr, " filled buffer %ld\n", datalen);
            if (datalen == 0)
                break;
            if (datalen < BUFSIZE)
                eof_f = 1;
            p = buf;
            left = datalen;
        }
        if (left <= 0)
            break;      /* only reached after the last refill */

        url = p;
        plen = url_scheme_len(p, left);
        if (plen == 0)
        {
            p++;
            continue;
        }
        p += plen;

        /*
         * i_f: -1 = no '/' seen after the prefix, 1 = the last character
         * seen was '/', 0 = something other than '.' followed a '/'.
         */
        for (valid = 0, lastdot = 0, i_f = -1; ; )
        {
            if (p - buf >= datalen)
            {
                /* end of input ends the URL; end of buffer needs a refill */
                valid = eof_f ? 1 : -1;
                break;
            }
            if (*p < 32 || (!accent_ok_f && *p > 127))
            {
                if (*p == 0 || *p == 0x0A || *p == 0x0D)
                    valid = 1;
                break;
            }
            else if (   *p == '\"' || *p == '\''
                     || *p == '>' || *p == '<'
                     || (!space_ok_f && *p == ' ')
                     || (!param_ok_f && (*p == '?' || *p == '#')))
            {
                valid = 1;
                break;
            }
            else if (*p == '.')
                lastdot = p;
            else if (*p == '/')
                i_f = 1;
            else if (i_f != -1)
                i_f = 0;
            p++;
        }
        if (!valid)
        {
            p = url + 1;
            continue;
        }
        else if (valid == -1)
        {
            if (url == buf)
            {
                /*
                 * The candidate already fills the whole buffer, so a
                 * refill cannot make room for its end.  Give up on it
                 * instead of refilling forever.
                 */
                valid = 0;
                p = url + 1;
                continue;
            }
            p = url;
            continue;
        }

        if (   (ext1[0] == 0 && ext2[0] == 0)
            || url_ext_matches(lastdot, p, ext1)
            || url_ext_matches(lastdot, p, ext2)
            || (index_f && i_f != 0))
        {
            /*
             * if name already in table continue,
             * else add name to table...
             */
            NameRec *RecPtr, *OldRecPtr = 0;
            int urllen = (int)(p - url);

            if (!(RecPtr = (NameRec *)malloc(sizeof(NameRec) + urllen + 1)))
                shutdown("out of memory\n");
            memcpy(RecPtr->Name, url, urllen);
            RecPtr->Name[urllen] = 0;
            if (!ubi_trInsert(&Root,
                              RecPtr,
                              RecPtr->Name,
                              &OldRecPtr)
               )
            {
                /*
                 * name already in table
                 */
                free(RecPtr);
                if (!dup_ok_f)
                    continue;
                RecPtr = OldRecPtr;
            }
            if (sort_f)
                continue;   /* sorted output is written after the scan */
            if (html_f > 0)
            {
                fprintf(f_out, "<HTML><TITLE>bookmarks</TITLE><BODY><UL>\n");
                html_f = -1;    /* header written, footer still pending */
            }
            if (html_f)
            {
                fprintf(f_out, "<LI><A HREF=\"%s\">%s</A>\n",
                        RecPtr->Name, RecPtr->Name + plen);
            }
            else
            {
                fprintf(f_out, "%s\n", RecPtr->Name);
            }
        }
    }

    if (sort_f)
    {
        NameRec *RecPtr;

        for (RecPtr = (NameRec *)ubi_trFirst(Root.root);
             RecPtr != 0;
             RecPtr = (NameRec *)ubi_trNext(RecPtr))
        {
            char *url;
            int plen;

            url = RecPtr->Name;
            if (html_f > 0)
            {
                fprintf(f_out, "<HTML><TITLE>bookmarks</TITLE><BODY><UL>\n");
                html_f = -1;
            }
            if (html_f)
            {
                /* one chain: each test must not undo an earlier match */
                if (strnicmp(url, "http://", 7) == 0)
                    plen = 7;
                else if (strnicmp(url, "https://", 8) == 0)
                    plen = 8;
                else if (strnicmp(url, "ftp://", 6) == 0)
                    plen = 6;
                else
                    plen = 0;
                fprintf(f_out, "<LI><A HREF=\"%s\">%s</A>\n", url, url + plen);
            }
            else
            {
                fprintf(f_out, "%s\n", url);
            }
        }
    }
    if (html_f < 0)
    {
        fprintf(f_out, "</UL></BODY></HTML>\n");
    }
    ubi_trKillTree(&Root, KillNode);
    exit(0);
}